### Random Forests

install.packages("readstata13")
library(readstata13)


#######################################################################################

## Time Use

## Income-generating

# Time use, income-generating outcome (`totalprod24`): fit a causal forest for
# the effect of `AforA`, rank covariates by importance, and plot the estimated
# treatment effects against selected covariates.

# Install missing packages once instead of re-installing them on every run.
for (pkg in c("grf", "tidyverse", "ggplot2", "cowplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library("grf")
library("tidyverse")
library(ggplot2)
library(cowplot)

Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Timeuse_R.dta")

# View() opens a data viewer; only do so when someone is there to look at it.
if (interactive()) {
  View(Data)
}

# Column 1 is the treatment, column 2 the outcome, the rest are covariates.
myvars <- c(
  "AforA", "totalprod24", "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)
Data <- Data[myvars]
Data <- na.omit(Data)

if (interactive()) {
  View(Data)
}

## Make A forest

# `clusters` belongs to causal_forest(); it used to be passed to
# model.matrix(), which silently ignores it, so the forest was not clustered.
# NOTE(review): SiteCode is still part of X as well as the cluster id, and
# `as.numeric(AforA) - 1` assumes AforA is a two-level factor -- confirm both.
cf <- causal_forest(
  X = model.matrix(~ ., data = Data[, -c(1, 2)]),
  Y = Data$totalprod24,
  W = as.numeric(Data$AforA) - 1,
  num.trees = 5000,
  clusters = as.integer(as.factor(Data$SiteCode)),
  seed = 1839
)

## Make predictions

# Without `newdata`, grf returns out-of-bag predictions for the training rows;
# passing the training matrix as `newdata` would give in-sample predictions.
preds <- predict(
  object = cf,
  estimate.variance = TRUE
)

Data$preds <- preds$predictions

## Explore the Nature of Heterogeneity
# What variables affect variations in treatment effects? Rank-order their importance
cf %>%
  variable_importance() %>%
  as.data.frame() %>%
  mutate(variable = colnames(cf$X.orig)) %>%
  arrange(desc(V1))

# Estimated treatment effect against each covariate, with a loess smoother.
p1 <- ggplot(Data, aes(x = months_since_ART_start, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p2 <- ggplot(Data, aes(x = months_since_HIV_dx, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p3 <- ggplot(Data, aes(x = age, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p4 <- ggplot(Data, aes(x = education, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

cowplot::plot_grid(p1, p2, p3, p4)
cowplot::plot_grid(p1)
####################################################################

##Non-resting

# Time use, non-resting outcome (`activity24`): fit a causal forest for the
# effect of `AforA`, rank covariates by importance, and plot the estimated
# treatment effects against selected covariates.

# Install missing packages once instead of re-installing them on every run.
for (pkg in c("grf", "tidyverse", "ggplot2", "cowplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(readstata13)
library("grf")
library("tidyverse")
library(ggplot2)
library(cowplot)

Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Timeuse_R.dta")

# View() opens a data viewer; only do so when someone is there to look at it.
if (interactive()) {
  View(Data)
}

# Column 1 is the treatment, column 2 the outcome, the rest are covariates.
myvars <- c(
  "AforA", "activity24", "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)
Data <- Data[myvars]
Data <- na.omit(Data)

if (interactive()) {
  View(Data)
}

## Make A forest

# `clusters` belongs to causal_forest(); it used to be passed to
# model.matrix(), which silently ignores it, so the forest was not clustered.
# NOTE(review): SiteCode is still part of X as well as the cluster id, and
# `as.numeric(AforA) - 1` assumes AforA is a two-level factor -- confirm both.
cf <- causal_forest(
  X = model.matrix(~ ., data = Data[, -c(1, 2)]),
  Y = Data$activity24,
  W = as.numeric(Data$AforA) - 1,
  num.trees = 5000,
  clusters = as.integer(as.factor(Data$SiteCode)),
  seed = 1839
)

## Make predictions

# Without `newdata`, grf returns out-of-bag predictions for the training rows;
# passing the training matrix as `newdata` would give in-sample predictions.
preds <- predict(
  object = cf,
  estimate.variance = TRUE
)

Data$preds <- preds$predictions

## Explore the Nature of Heterogeneity
# What variables affect variations in treatment effects? Rank-order their importance
cf %>%
  variable_importance() %>%
  as.data.frame() %>%
  mutate(variable = colnames(cf$X.orig)) %>%
  arrange(desc(V1))

# Estimated treatment effect against each covariate, with a loess smoother.
p1 <- ggplot(Data, aes(x = months_since_ART_start, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p2 <- ggplot(Data, aes(x = months_since_HIV_dx, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p3 <- ggplot(Data, aes(x = age, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p4 <- ggplot(Data, aes(x = education, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

cowplot::plot_grid(p1, p2, p3, p4)




#####################################################################


#######################################################################################

## Expenditures


# Expenditures outcome (`totalexp_month`): fit a causal forest for the effect
# of `AforA`, rank covariates by importance, and plot the estimated treatment
# effects against selected covariates.

# Install missing packages once instead of re-installing them on every run.
for (pkg in c("grf", "tidyverse", "ggplot2", "cowplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library("grf")
library("tidyverse")
library(ggplot2)
library(cowplot)

Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Expenditures_R.dta")

# View() opens a data viewer; only do so when someone is there to look at it.
if (interactive()) {
  View(Data)
}

# Column 1 is the treatment, column 2 the outcome, the rest are covariates.
myvars <- c(
  "AforA", "totalexp_month", "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)
Data <- Data[myvars]
Data <- na.omit(Data)

if (interactive()) {
  View(Data)
}

## Make A forest

# `clusters` belongs to causal_forest(); it used to be passed to
# model.matrix(), which silently ignores it, so the forest was not clustered.
# NOTE(review): SiteCode is still part of X as well as the cluster id, and
# `as.numeric(AforA) - 1` assumes AforA is a two-level factor -- confirm both.
cf <- causal_forest(
  X = model.matrix(~ ., data = Data[, -c(1, 2)]),
  Y = Data$totalexp_month,
  W = as.numeric(Data$AforA) - 1,
  num.trees = 5000,
  clusters = as.integer(as.factor(Data$SiteCode)),
  seed = 1839
)

## Make predictions

# Without `newdata`, grf returns out-of-bag predictions for the training rows;
# passing the training matrix as `newdata` would give in-sample predictions.
preds <- predict(
  object = cf,
  estimate.variance = TRUE
)

Data$preds <- preds$predictions

## Explore the Nature of Heterogeneity
# What variables affect variations in treatment effects? Rank-order their importance
cf %>%
  variable_importance() %>%
  as.data.frame() %>%
  mutate(variable = colnames(cf$X.orig)) %>%
  arrange(desc(V1))

# Estimated treatment effect against each covariate, with a loess smoother.
p1 <- ggplot(Data, aes(x = months_since_ART_start, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p2 <- ggplot(Data, aes(x = months_since_HIV_dx, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p3 <- ggplot(Data, aes(x = age, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p4 <- ggplot(Data, aes(x = education, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

cowplot::plot_grid(p1, p2, p3, p4)




#####################################################################


## EMPLOYMENT

# Employment outcome (`incomegen`): fit a causal forest for the effect of
# `AforA`, rank covariates by importance, and plot the estimated treatment
# effects against selected covariates.

# Install missing packages once instead of re-installing them on every run.
for (pkg in c("grf", "tidyverse", "ggplot2", "cowplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library("grf")
library("tidyverse")
library(ggplot2)
library(cowplot)

Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Employment_R.dta")

# View() opens a data viewer; only do so when someone is there to look at it.
if (interactive()) {
  View(Data)
}

# Column 1 is the treatment, column 2 the outcome, the rest are covariates.
myvars <- c(
  "AforA", "incomegen", "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)
Data <- Data[myvars]
Data <- na.omit(Data)

if (interactive()) {
  View(Data)
}

## Make A forest

# `clusters` belongs to causal_forest(); it used to be passed to
# model.matrix(), which silently ignores it, so the forest was not clustered.
# NOTE(review): SiteCode is still part of X as well as the cluster id, and
# `as.numeric(AforA) - 1` assumes AforA is a two-level factor -- confirm both.
cf <- causal_forest(
  X = model.matrix(~ ., data = Data[, -c(1, 2)]),
  Y = Data$incomegen,
  W = as.numeric(Data$AforA) - 1,
  num.trees = 5000,
  clusters = as.integer(as.factor(Data$SiteCode)),
  seed = 1839
)

## Make predictions

# Without `newdata`, grf returns out-of-bag predictions for the training rows;
# passing the training matrix as `newdata` would give in-sample predictions.
# (The empty, never-read `new.dat` data frame was dropped.)
preds <- predict(
  object = cf,
  estimate.variance = TRUE
)

Data$preds <- preds$predictions

## Explore the Nature of Heterogeneity
# What variables affect variations in treatment effects? Rank-order their importance
cf %>%
  variable_importance() %>%
  as.data.frame() %>%
  mutate(variable = colnames(cf$X.orig)) %>%
  arrange(desc(V1))

# Estimated treatment effect against each covariate, with a loess smoother.
p1 <- ggplot(Data, aes(x = months_since_ART_start, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p2 <- ggplot(Data, aes(x = months_since_HIV_dx, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p3 <- ggplot(Data, aes(x = age, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p4 <- ggplot(Data, aes(x = education, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

cowplot::plot_grid(p1, p2, p3, p4)


##################################################################

###### Assets


# Assets outcome (`assetsum`): fit a causal forest for the effect of `AforA`,
# rank covariates by importance, and plot the estimated treatment effects
# against selected covariates.

# Install missing packages once instead of re-installing them on every run.
for (pkg in c("grf", "tidyverse", "ggplot2", "cowplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library("grf")
library("tidyverse")
library(ggplot2)
library(cowplot)

Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Assets_R.dta")

# View() opens a data viewer; only do so when someone is there to look at it.
if (interactive()) {
  View(Data)
}

# Column 1 is the treatment, column 2 the outcome, the rest are covariates.
myvars <- c(
  "AforA", "assetsum", "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)
Data <- Data[myvars]
Data <- na.omit(Data)

if (interactive()) {
  View(Data)
}

## Make A forest

# `clusters` belongs to causal_forest(); it used to be passed to
# model.matrix(), which silently ignores it, so the forest was not clustered.
# NOTE(review): SiteCode is still part of X as well as the cluster id, and
# `as.numeric(AforA) - 1` assumes AforA is a two-level factor -- confirm both.
cf <- causal_forest(
  X = model.matrix(~ ., data = Data[, -c(1, 2)]),
  Y = Data$assetsum,
  W = as.numeric(Data$AforA) - 1,
  num.trees = 5000,
  clusters = as.integer(as.factor(Data$SiteCode)),
  seed = 1839
)

## Make predictions

# Without `newdata`, grf returns out-of-bag predictions for the training rows;
# passing the training matrix as `newdata` would give in-sample predictions.
# (The empty, never-read `new.dat` data frame was dropped.)
preds <- predict(
  object = cf,
  estimate.variance = TRUE
)

Data$preds <- preds$predictions

## Explore the Nature of Heterogeneity
# What variables affect variations in treatment effects? Rank-order their importance
cf %>%
  variable_importance() %>%
  as.data.frame() %>%
  mutate(variable = colnames(cf$X.orig)) %>%
  arrange(desc(V1))

# Estimated treatment effect against each covariate, with a loess smoother.
p1 <- ggplot(Data, aes(x = months_since_ART_start, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p2 <- ggplot(Data, aes(x = months_since_HIV_dx, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p3 <- ggplot(Data, aes(x = age, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

p4 <- ggplot(Data, aes(x = education, y = preds)) +
  geom_point() +
  geom_smooth(method = "loess", span = 1) +
  theme_light()

cowplot::plot_grid(p1, p2, p3, p4)

 


 
  ##############################################
#### Susan Athey

## Pilot Random Forest
## Pilot causal-forest workflow for `incomegen`: estimate nuisance forests,
## pre-select covariates by importance, refit, then test for heterogeneity.
## The previous lines were a copy-paste with spaces inside identifiers
## (e.g. `regression _ forest`) plus a pasted console-output line, so they did
## not parse, and X, Y, W and the cluster id were never defined.

# Covariates are picked by name so that a `preds` column added to `Data` by an
# earlier section can never leak into the design matrix.
covars <- c(
  "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)

# `incomegen` is selected from the employment data set in this script; reload
# it when `Data` currently holds a different extract.
if (!"incomegen" %in% names(Data)) {
  Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Employment_R.dta")
  Data <- na.omit(Data[c("AforA", "incomegen", covars)])
}

X <- model.matrix(~ ., data = Data[, covars])
Y <- Data$incomegen
# NOTE(review): assumes AforA is a two-level factor (codes 1/2) -- confirm.
W <- as.numeric(Data$AforA) - 1
# Study site is the clustering unit (the pasted snippet called it `school.id`).
site.id <- as.integer(as.factor(Data$SiteCode))

# Nuisance estimates: out-of-bag E[Y | X] and E[W | X].
Y.forest <- regression_forest(X, Y, clusters = site.id)
Y.hat <- predict(Y.forest)$predictions
W.forest <- regression_forest(X, W, clusters = site.id)
W.hat <- predict(W.forest)$predictions

# First pass on all covariates, used only to rank them.
cf.raw <- causal_forest(
  X, Y, W,
  Y.hat = Y.hat, W.hat = W.hat,
  clusters = site.id
)
varimp <- variable_importance(cf.raw)
selected.idx <- which(varimp > mean(varimp))

# Second pass on covariates with above-average importance.
# `drop = FALSE` keeps X a matrix when a single column is selected.
# NOTE(review): the pasted snippet used `samples_per_cluster = 50` and
# `tune.parameters = TRUE`; those were replaced here by
# `equalize.cluster.weights` / `tune.parameters = "all"` on the assumption
# that a current grf release is installed -- confirm against the installed
# version.
cf <- causal_forest(
  X[, selected.idx, drop = FALSE], Y, W,
  Y.hat = Y.hat, W.hat = W.hat,
  clusters = site.id,
  equalize.cluster.weights = TRUE,
  tune.parameters = "all"
)
tau.hat <- predict(cf)$predictions

##Start with omnibus test
#

# Compare regions with high and low estimated CATEs
high_effect <- tau.hat > median(tau.hat)
ate.high <- average_treatment_effect(cf, subset = high_effect)
ate.low <- average_treatment_effect(cf, subset = !high_effect)
paste(
  "95% CI for difference in ATE:",
  round(ate.high[1] - ate.low[1], 3), "+/-",
  round(qnorm(0.975) * sqrt(ate.high[2]^2 + ate.low[2]^2), 3)
)
# Console output that was pasted together with the snippet (presumably from
# the source example rather than from this data):
# "95% CI for difference in ATE : 0.053 +/- 0.071"

# Run best linear predictor analysis
test_calibration(cf)


## make test and training dataset

# Train/test analysis for `incomegen`: fit a causal forest on a 60% training
# sample, predict treatment effects for both samples, and compare the ATE
# between training rows with above- and below-median predicted effects.

# Covariates are picked by name so that a `preds` column added to `Data` by an
# earlier section can never leak into the design matrix.
covars <- c(
  "female", "education", "married", "age",
  "months_since_HIV_dx", "months_since_ART_start", "period", "SiteCode"
)

# `incomegen` is selected from the employment data set in this script; reload
# it when `Data` currently holds a different extract.
if (!"incomegen" %in% names(Data)) {
  Data <- read.dta13("C:/Users/ge79jas/Dokumente/Post Doc Goettingen/MaxART Economic Outcomes/Dataset/Employment_R.dta")
  Data <- na.omit(Data[c("AforA", "incomegen", covars)])
}

# Build the design matrix once so train and test share identical columns.
X_all <- model.matrix(~ ., data = Data[, covars])
stopifnot(nrow(X_all) == nrow(Data))

set.seed(1234)
cases <- sample(seq_len(nrow(Data)), round(nrow(Data) * .6))
train <- Data[cases, ]
test <- Data[-cases, ]

if (interactive()) {
  View(train)
  View(Data)
}

# train
# The forest is fit on the training rows only; it used to be fit on all of
# `Data`, which made the `subset` vectors below the wrong length.
# NOTE(review): unlike the other forests in this script no `clusters` are
# passed here -- confirm whether that is intended.
cf <- causal_forest(
  X = X_all[cases, , drop = FALSE],
  Y = train$incomegen,
  W = as.numeric(train$AforA) - 1,
  num.trees = 5000,
  seed = 1839
)

#ATE
# average_treatment_effect() returns c(estimate, std.err).
ATE <- average_treatment_effect(cf)
half_width <- qnorm(0.975) * ATE[2]
CI <- round(c(ATE[1] - half_width, ATE[1] + half_width), 3)
names(CI) <- c("lower", "upper")

# predict
# Held-out rows are predicted via `newdata`; training rows get out-of-bag
# predictions (no `newdata`). `preds` keeps the training-set predictions, as
# it did after the original pair of calls.
preds_test <- predict(
  object = cf,
  newdata = X_all[-cases, , drop = FALSE],
  estimate.variance = TRUE
)

preds <- predict(
  object = cf,
  estimate.variance = TRUE
)

test$preds <- preds_test$predictions
train$preds <- preds$predictions

## Heterogeneity on the training sample
# `subset` must index the rows the forest was trained on, hence `train`.
high.effect <- train$preds > median(train$preds)
ate.high <- average_treatment_effect(cf, subset = high.effect)
ate.low <- average_treatment_effect(cf, subset = !high.effect)


